#encoding=utf-8
from spider.core.Opener import Opener
import jieba.analyse
import re
from analysis.core.db import db
class my(db):
    def run(self):
        self.cursor.execute("select content,url from news ");
        rows = self.cursor.fetchall()
        for row in rows:
            content = row[0]
            if not content:
                print("bad:------")
            content = re.subn('[a-z0-9A-Z\s,<.>/?;:\'"\[{\]}+-_=`~!@#$%^&*()]+',' ',content)[0]
            content = re.subn('<style[\s\S]+?</style>','',content)[0]
            content = re.subn('<!--[\s\S]+?-->','',content)[0]
            content = re.subn('<script[\s\S]+?</script>','',content)[0]
            tags = jieba.analyse.extract_tags(content,4)
            # if not ",".join(tags):
            #     print(row[0])
            #     return
            print ",".join(tags),row[1]
# Script entry: build the extractor and process every news row right away.
# This runs at import time as well, because there is no __main__ guard.
a = my()
my.run(a)